1 covid case

# county-level socialeconomic information
county_data <- fread("data/covid_county.csv")
# county-level COVID case and death
covid_rate <- fread("data/covid_rates.csv")
# county-level lockdown dates
covid_intervention <- fread("data/covid_intervention.csv")

1.1 understand the data

view(dfSummary(covid_rate),method = "render")
view(dfSummary(county_data),method = "render")

1.2 covid case trend

1.2.1 three states, by day and state

# unique(covid_rate$State)
covid.3state.day = covid_rate[State %in% c("New York","Washington","Florida"),.(
  cum_cases = sum(cum_cases),
  cum_deaths = sum(cum_deaths),
  week = mean(week)
),by = .(date,State)]
covid.3state.day = covid.3state.day[order(list(State,date))]

covid.3state.day[,new_cases := c(NA,diff(cum_cases)),by = State]
covid.3state.day[,date := ymd(date)]
ggplot(covid.3state.day,aes(x=date,y=new_cases,group=State,color=State))+
  geom_line()+
  scale_x_date(date_breaks = "3 month",date_labels = "%Y-%m")+
  labs(title = "New Cases Trends for NY, WA and FL")+
  theme_bw()

The biggest problem: daily variability is extremely high, making the trends zigzagging. Maybe there is an innate difference among days in a week because people have different work and life style in different days. We may wish to smooth out these excessive noise by aggregating in the weekly level.

1.2.2 (ii)

# check
covid_rate[State == "New York",length(County),by = .(State,date)] # problem: some counties do not report cases in some days. New added counties cause problems
##         State       date V1
##   1: New York 2020-03-12 17
##   2: New York 2020-03-13 18
##   3: New York 2020-03-14 21
##   4: New York 2020-03-15 25
##   5: New York 2020-03-16 28
##  ---                       
## 354: New York 2020-03-07  9
## 355: New York 2020-03-08 11
## 356: New York 2020-03-09 11
## 357: New York 2020-03-10 11
## 358: New York 2020-03-11 12

We find that the number of counties reporting cases every day is incresaing. We may want to adjust the total population accordingly.

covid.weekend = covid_rate[,.(
  cum_cases_weekend = cum_cases[length(cum_cases)],
  cum_deaths_weekend = cum_deaths[length(cum_deaths)],
  TotalPopEst2019.weekend = TotalPopEst2019[length(TotalPopEst2019)]
), by = .(week,State,County)]
covid.weekend = covid.weekend[order(list(State,County,week))]
covid.weekend[,":="(new_cases = c(NA,diff(cum_cases_weekend)),
                                 new_deaths = c(NA,diff(cum_deaths_weekend))),
                           by = .(State,County)]
covid.week = covid.weekend[!is.na(new_cases)&!is.na(new_deaths),.(
  new_cases = sum(new_cases),
  new_deaths = sum(new_deaths),
  TotalPopEst2019 = sum(TotalPopEst2019.weekend)
),by = .(State,week)]
covid.week[,weekly_case_per100k := new_cases/TotalPopEst2019*100000]

# data.ma = covid_rate[State == "Massachusetts"] # pay attention: week 33 of MA

ggplot(covid.week,aes(x=week,y=weekly_case_per100k,group=State,color=State))+
  geom_line()+
  labs(title = "Spaghetti Plots, Weekly New Cases")+
  theme_bw()

1.2.3 (iii)

1.2.4 (iv)

# summary(covid_intervention)
# some examples

covid_intervention[STATE == "New York"]
## Empty data.table (0 rows and 16 cols): FIPS,STATE,AREA_NAME,stay at home,>50 gatherings,>500 gatherings...
covid_intervention_state = covid_intervention[substr(FIPS,nchar(FIPS)-2,nchar(FIPS)) == "000"]

# NY
ggplot(covid.3state.day[State == "New York"],aes(x=date,y=new_cases))+
  geom_line()+
  scale_x_date(date_breaks = "3 month",date_labels = "%Y-%m")+
  geom_vline(xintercept = covid_intervention_state[STATE == "NY","stay at home"][[1,1]],color = "red",lty = 5)+
  labs(title = "New Cases Trends for NY")+
  theme_bw()

# FL
ggplot(covid.3state.day[State == "Florida"],aes(x=date,y=new_cases))+
  geom_line()+
  scale_x_date(date_breaks = "3 month",date_labels = "%Y-%m")+
  geom_vline(xintercept = covid_intervention_state[STATE == "FL","stay at home"][[1,1]],color = "red",lty = 5)+
  labs(title = "New Cases Trends for FL")+
  theme_bw()

The graphs for these two states indicate that the lockdown policy may have some positive effects on slowing down the spread of the virus.

1.3 covid death trend

1.3.1 Monthly deaths per 100k heatmap

# pay attention to numbers smaller than zero
covid_rate = covid_rate[order(FIPS,date)]
covid_rate[,":="(new_cases = c(NA,diff(cum_cases)),
                 new_deaths = c(NA,diff(cum_deaths)),
                 year = year(date),
                 month = month(date),
                year.month = round_date(date,"month")),by = FIPS]
covid.month = covid_rate[!is.na(new_cases)&!is.na(new_deaths),.(
  new_cases = sum(new_cases),
  new_deaths = sum(new_deaths),
  TotalPopEst2019 = TotalPopEst2019[length(TotalPopEst2019)]
),by = .(State,County,year,month)]
covid.month = covid.month[,.(
  new_cases = sum(new_cases),
  new_deaths = sum(new_deaths),
  TotalPopEst2019 = sum(TotalPopEst2019)
),by = .(State,year,month)]

covid.month[,monthly_death_per100k := new_deaths/TotalPopEst2019*100000]

Here we only give one example: 2020-9. The plots for all months are shown in (ii).

covid.death.plot.list = list()
setnames(covid.month,"State","state")
max_col = quantile(covid.month$monthly_death_per100k,1,na.rm = T)
min_col = quantile(covid.month$monthly_death_per100k,0,na.rm = T)
for (i in 2:12) {
  covid.death.plot.list[[i-1]] =
    plot_usmap(regions = "state",data = covid.month[year == 2020 & month == i],
               values = "monthly_death_per100k", exclude = c("Hawaii", "Alaska"),color = "black") + 
    scale_fill_distiller(
      palette = "Reds",direction = 1,
      name = "Number of New Covid Deaths per 100,000 People", 
      limits = c(min_col, max_col)) + 
    labs(title = paste0("New Covid Deaths, 2020-",i), subtitle = "Continental US States") +
    theme(legend.position = "right")
}

ggplotly(covid.death.plot.list[[8]])
# plot_usmap(regions = "state",data = covid.month[year == 2020 & month == i],
#                values = "monthly_death_per100k", exclude = c("Hawaii", "Alaska"),color = "black") + 
#     scale_fill_gradient(
#       low = "white", high = "red", 
#       name = "Number of New Covid Deaths per 100,000 People", 
#       label = scales::comma) + 
#     labs(title = paste0("New Covid Deaths, 2020-",i), subtitle = "Continental US States") +
#     theme(legend.position = "right")

1.3.2 (ii)

# ggplotly

test.long = covid.month[year == 2020,.(month,state,monthly_death_per100k)]
# test = test.long %>%
#   pivot_wider(names_from = month,values_from = monthly_death_per100k) %>%
#   mutate(state = tolower(state))
# 
# map.wide = test %>% 
#   left_join(map_data("state"),by = c("state" = "region"))
# map.wide = data.table(map.wide)
# map = map.wide[order(order)] %>% 
#   pivot_longer(cols = 2:13,names_to = "month",values_to = "monthly_death_per100k") %>% 
#   mutate(month = as.numeric(month))
# map = data.table(map)
# map = map[order(list(month,order))]
# 
# plot.test = ggplot(map,aes(x=long,y=lat,group=group))+
#   geom_polygon(aes(fill = monthly_death_per100k))+
#   geom_path()+
#   scale_fill_distiller(palette = "Reds", direction = 1,
#                        name = "Num", 
#       limits = c(min_col, max_col))+
#   labs(title = paste0("New Monthly Covid Deaths per 100k"), subtitle = "Continental US States")+
#   theme_bw()
# ggplotly(plot.test) # error?

# plotly
abbr = unique(us_map(regions =
"states") %>% select(abbr, full)) 

plotly.data = covid.month[year == 2020,.(month,state,monthly_death_per100k)] %>%
  mutate(hover =  paste(state, '<br>', 
                        'new covid deaths', round(monthly_death_per100k, 3))) %>% 
  left_join(abbr,by = c("state" = "full"))

# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white'),
  exclude = c("")
)

fig <- plot_geo(plotly.data, locationmode = 'USA-states')
fig <- fig %>% add_trace(
    z = ~monthly_death_per100k, text = ~hover, locations = ~abbr,
    color = ~monthly_death_per100k, colors = 'Reds',
    zmin = min_col, zmax = max_col,frame = ~month
  )

fig <- fig %>% colorbar(title = "Monthly new covid deaths of state")
fig <- fig %>% layout(
    title = 'Monthly new covid deaths of state',
    geo = g,
    hoverlabel = list(bgcolor="white")
  )
fig
save.image("codes/hw3_covid.RData")
load("codes/hw3_covid.RData")

1.4 covid factor

county.covid = covid_rate[date == as.Date("2021-02-01"),.(
  FIPS,County,State,
  total_death_per100k = cum_deaths/TotalPopEst2019*100000
)] %>% 
  right_join(county_data, by = "FIPS") %>%
  mutate(log_total_death_per100k = log(total_death_per100k + 1))

county.covid.sub <- county.covid %>%
  select(log_total_death_per100k,State.x,County.x, FIPS, Deep_Pov_All, PovertyAllAgesPct, PerCapitaInc, UnempRate2019, PctEmpFIRE, PctEmpConstruction, PctEmpTrans, PctEmpMining, PctEmpTrade, PctEmpInformation, PctEmpAgriculture, PctEmpManufacturing, PctEmpServices, PopDensity2010, OwnHomePct, Age65AndOlderPct2010, TotalPop25Plus, Under18Pct2010, Ed2HSDiplomaOnlyPct, Ed3SomeCollegePct, Ed4AssocDegreePct, Ed5CollegePlusPct, ForeignBornPct, Net_International_Migration_Rate_2010_2019, NetMigrationRate1019, NaturalChangeRate1019, TotalPopEst2019, WhiteNonHispanicPct2010, NativeAmericanNonHispanicPct2010, BlackNonHispanicPct2010, AsianNonHispanicPct2010, HispanicPct2010, Type_2015_Update, RuralUrbanContinuumCode2013, UrbanInfluenceCode2013, Perpov_1980_0711, HiCreativeClass2000, HiAmenity, Retirement_Destination_2015_Update)

setnames(county.covid.sub,c("State.x","County.x"),c("state","county"))

Num of missings in every variable

# county.covid.sub[is.na(state)]
apply(is.na(county.covid.sub), 2, sum) # missing in state, county: all aggregate states, puerto rico, hawaii, alaska; bedford in Virginia has two obs, duplicated
##                    log_total_death_per100k 
##                                        171 
##                                      state 
##                                        171 
##                                     county 
##                                        171 
##                                       FIPS 
##                                          0 
##                               Deep_Pov_All 
##                                          6 
##                          PovertyAllAgesPct 
##                                         86 
##                               PerCapitaInc 
##                                          6 
##                              UnempRate2019 
##                                          7 
##                                 PctEmpFIRE 
##                                          7 
##                         PctEmpConstruction 
##                                          7 
##                                PctEmpTrans 
##                                          7 
##                               PctEmpMining 
##                                          7 
##                                PctEmpTrade 
##                                          7 
##                          PctEmpInformation 
##                                          7 
##                          PctEmpAgriculture 
##                                          7 
##                        PctEmpManufacturing 
##                                          7 
##                             PctEmpServices 
##                                          7 
##                             PopDensity2010 
##                                          6 
##                                 OwnHomePct 
##                                          6 
##                       Age65AndOlderPct2010 
##                                          6 
##                             TotalPop25Plus 
##                                          6 
##                             Under18Pct2010 
##                                          6 
##                        Ed2HSDiplomaOnlyPct 
##                                          6 
##                          Ed3SomeCollegePct 
##                                          6 
##                          Ed4AssocDegreePct 
##                                          6 
##                          Ed5CollegePlusPct 
##                                          6 
##                             ForeignBornPct 
##                                         85 
## Net_International_Migration_Rate_2010_2019 
##                                         85 
##                       NetMigrationRate1019 
##                                         85 
##                      NaturalChangeRate1019 
##                                         85 
##                            TotalPopEst2019 
##                                          6 
##                    WhiteNonHispanicPct2010 
##                                          6 
##           NativeAmericanNonHispanicPct2010 
##                                          6 
##                    BlackNonHispanicPct2010 
##                                          6 
##                    AsianNonHispanicPct2010 
##                                          6 
##                            HispanicPct2010 
##                                          6 
##                           Type_2015_Update 
##                                        136 
##                RuralUrbanContinuumCode2013 
##                                         57 
##                     UrbanInfluenceCode2013 
##                                         57 
##                           Perpov_1980_0711 
##                                        136 
##                        HiCreativeClass2000 
##                                        140 
##                                  HiAmenity 
##                                        172 
##         Retirement_Destination_2015_Update 
##                                        136
county.covid.sub = na.omit(county.covid.sub) # -174

1.4.1 lasso

set.seed(1)
model.var = model.matrix(log_total_death_per100k~.,data = county.covid.sub[,-c("FIPS","county")])[,-1] # no Alabama here
# head(model.test)
death = county.covid.sub[,log_total_death_per100k]
fit1.cv = cv.glmnet(model.var,death,alpha = 1, nfolds = 10, intercept = T,
                    penalty.factor = c(rep(0,48),rep(1,ncol(model.var)-48))) # alpha: the para in elastic net
coef.min = coef(fit1.cv,s="lambda.1se") # more parsimonious usually. If use min, maybe there is noise
# coef(fit1.cv,s="lambda.1min")
nonzero.coef = coef.min[which(coef.min!=0),]
# plot(fit1.cv)
nonzero.var = names(nonzero.coef)[-1]

1.4.2 Fine tune the model

Relaxed lasso result

data.selected = data.table(death,model.var[,which(colnames(model.var) %in% nonzero.var)])

fit.1se.lm = lm(death~.,data = data.selected)
# relaxed lasso
stargazer(fit.1se.lm,type=output_format, align=TRUE)
Dependent variable:
death
stateArizona 0.267
(0.238)
stateArkansas 0.056
(0.135)
stateCalifornia -0.771***
(0.159)
stateColorado -0.357**
(0.153)
stateConnecticut 0.178
(0.303)
stateDelaware -0.024
(0.469)
stateDistrict of Columbia 0.520
(0.807)
stateFlorida 0.031
(0.147)
stateGeorgia 0.066
(0.117)
stateIdaho -0.322*
(0.166)
stateIllinois 0.211
(0.132)
stateIndiana -0.074
(0.134)
stateIowa 0.287**
(0.134)
stateKansas -0.628***
(0.137)
stateKentucky -0.756***
(0.128)
stateLouisiana 0.375***
(0.143)
stateMaine -1.370***
(0.225)
stateMaryland -0.059
(0.196)
stateMassachusetts 0.077
(0.240)
stateMichigan -0.147
(0.135)
stateMinnesota -0.157
(0.138)
stateMississippi 0.264**
(0.132)
stateMissouri -0.378***
(0.129)
stateMontana 0.547***
(0.158)
stateNebraska -0.386***
(0.143)
stateNevada -0.577**
(0.227)
stateNew Hampshire -0.806***
(0.273)
stateNew Jersey 0.329
(0.209)
stateNew Mexico -0.653***
(0.192)
stateNew York -0.367**
(0.151)
stateNorth Carolina -0.368***
(0.127)
stateNorth Dakota 0.950***
(0.166)
stateOhio -0.523***
(0.134)
stateOklahoma -0.374***
(0.137)
stateOregon -0.865***
(0.174)
statePennsylvania 0.039
(0.146)
stateRhode Island -0.177
(0.372)
stateSouth Carolina -0.011
(0.153)
stateSouth Dakota 0.809***
(0.151)
stateTennessee 0.058
(0.130)
stateTexas 0.168
(0.127)
stateUtah -1.090***
(0.196)
stateVermont -2.140***
(0.239)
stateVirginia -0.476***
(0.123)
stateWashington -0.857***
(0.168)
stateWest Virginia -0.678***
(0.155)
stateWisconsin -0.134
(0.140)
stateWyoming 0.208
(0.205)
PovertyAllAgesPct 0.003
(0.005)
PerCapitaInc -0.00001
(0.00001)
PctEmpConstruction -0.049***
(0.007)
PctEmpMining -0.019***
(0.006)
PctEmpAgriculture -0.039***
(0.004)
PctEmpManufacturing 0.003
(0.003)
PopDensity2010 0.00003***
(0.00001)
Age65AndOlderPct2010 0.034***
(0.008)
Under18Pct2010 0.060***
(0.008)
Ed3SomeCollegePct -0.021***
(0.005)
Ed5CollegePlusPct -0.011***
(0.004)
NetMigrationRate1019 -0.012***
(0.003)
NaturalChangeRate1019 -0.044***
(0.010)
WhiteNonHispanicPct2010 -0.003*
(0.002)
HispanicPct2010 0.009***
(0.002)
Type_2015_Update -0.019**
(0.009)
Constant 4.530***
(0.417)
Observations 3,105
R2 0.359
Adjusted R2 0.345
Residual Std. Error 0.790 (df = 3040)
F Statistic 26.600*** (df = 64; 3040)
Note: p<0.1; p<0.05; p<0.01

BIC graphs

fit.final.1 =  regsubsets(death~.,data = data.selected,method = "exhaustive",
                          nvmax = ncol(data.selected)-1,force.in = c(1:48),really.big = T) # compared to Arizona

summary.fit.final.1 = summary(fit.final.1)
plot(summary.fit.final.1$bic)

opt.index = 10 # with bic

We choose \(p=10\) by BIC criteria.

Final model after fine tuning

bic.var.select = summary.fit.final.1$which[opt.index,-1]
bic.var = names(bic.var.select)[which(bic.var.select)] 
# bic.var
final.expr = as.formula(paste("death", "~", paste(bic.var, collapse = "+"))) 
fit.final.2 = lm(final.expr,data = data.selected)

stargazer(fit.final.2,type=output_format, align=TRUE)
Dependent variable:
death
stateArizona 0.208
(0.237)
stateArkansas 0.021
(0.134)
stateCalifornia -0.863***
(0.155)
stateColorado -0.434***
(0.150)
stateConnecticut 0.070
(0.299)
stateDelaware -0.099
(0.469)
stateDistrict of Columbia 0.657
(0.804)
stateFlorida -0.016
(0.142)
stateGeorgia 0.053
(0.116)
stateIdaho -0.399**
(0.163)
stateIllinois 0.107
(0.127)
stateIndiana -0.183
(0.128)
stateIowa 0.178
(0.129)
stateKansas -0.706***
(0.133)
stateKentucky -0.841***
(0.121)
stateLouisiana 0.371***
(0.141)
stateMaine -1.490***
(0.222)
stateMaryland -0.157
(0.191)
stateMassachusetts -0.005
(0.238)
stateMichigan -0.237*
(0.132)
stateMinnesota -0.268**
(0.133)
stateMississippi 0.309**
(0.131)
stateMissouri -0.459***
(0.123)
stateMontana 0.485***
(0.155)
stateNebraska -0.482***
(0.138)
stateNevada -0.647***
(0.226)
stateNew Hampshire -0.950***
(0.271)
stateNew Jersey 0.254
(0.203)
stateNew Mexico -0.725***
(0.190)
stateNew York -0.429***
(0.142)
stateNorth Carolina -0.387***
(0.126)
stateNorth Dakota 0.848***
(0.161)
stateOhio -0.632***
(0.129)
stateOklahoma -0.400***
(0.136)
stateOregon -0.932***
(0.172)
statePennsylvania -0.068
(0.140)
stateRhode Island -0.287
(0.370)
stateSouth Carolina -0.001
(0.152)
stateSouth Dakota 0.738***
(0.148)
stateTennessee -0.003
(0.128)
stateTexas 0.119
(0.126)
stateUtah -1.200***
(0.188)
stateVermont -2.280***
(0.236)
stateVirginia -0.516***
(0.121)
stateWashington -0.923***
(0.167)
stateWest Virginia -0.792***
(0.148)
stateWisconsin -0.252*
(0.136)
stateWyoming 0.106
(0.203)
PctEmpConstruction -0.057***
(0.007)
PctEmpMining -0.024***
(0.005)
PctEmpAgriculture -0.041***
(0.003)
Age65AndOlderPct2010 0.032***
(0.008)
Under18Pct2010 0.060***
(0.007)
Ed3SomeCollegePct -0.024***
(0.005)
Ed5CollegePlusPct -0.016***
(0.002)
NetMigrationRate1019 -0.014***
(0.002)
NaturalChangeRate1019 -0.040***
(0.010)
HispanicPct2010 0.011***
(0.002)
Constant 4.520***
(0.267)
Observations 3,105
R2 0.354
Adjusted R2 0.342
Residual Std. Error 0.792 (df = 3046)
F Statistic 28.800*** (df = 58; 3046)
Note: p<0.1; p<0.05; p<0.01
# all significant

Age65AnOlderPct2010 has a significantly positive coefficient. However, Under18Pct2010 also has a significantly positive coefficient and is larger than that for elderly. This does not give strong support to the argument that covid affects the elderly the most; we would rather interpret it as, covid affect the middle ages the least.

BlackPct is not in the regression after controlling for existing variables while HispanicPct is in the regression. The coefficient for it is significantly positive, indicating that a higher Hispanic percentage in the region is connected with a higher fatal rate of covid. The analysis gives some support on a higher fatal rate in Hispanic group; it is not very clear for the black group.

1.4.3 diagnosis

scatter.list = list()

for (i in 49:58) { # cannot do this. the plot is built only after it's invoked if in the loop? Use aes_string
  name = bic.var[i]
  scatter.list[[i-48]] = ggplot(data.selected,aes_string(x=name,y="death"))+
    geom_point()+
    geom_smooth(method = "lm")+
    xlab(name)+
    ylab("log.new.death")+
    theme_bw()
}

Scatter Plots

plot_grid(plotlist = scatter.list)

# hist(county.covid$total_death_per100k,breaks = seq(0,900,10))

Residual Plot

# residual plot
plot(fit.final.2,1)

QQ Plot

# qq plot
plot(fit.final.2,2)

Seems not a good fit. Homoscedasticity is not well satisfied from the residual plot; from the residual plot we can see that the residuals have much thicker tails than the normal distribution.

1.4.4 Improvements

From the scatter plots presented above, we can see that there are a lot of counties with zero in log total covid death per 100k and they are distant to other observations. Therefore, we may consider mixture models (for example, zero-inflated model) and classify them into two groups first, then make inference within every group.

As with the possible important variables, medical conditions could be one of them. Also total number of death per 100k may not be a perfect measure for our goal because it contains a part of randomness, i.e., the spread of covid in an area can have some random determinants; the virus might unexpectedly break out in some areas and result in a higher infection rate and mortality rate. We may want to complement the analysis with another dependent variable like \(\frac{TotalDeaths}{TotalCases}\).

1.4.5 Possible Imputations

Missing values are clustered in Puerto Rico, Alaska. These states are different in property with continent states so we may not trust the imputation results.